#include <iostream>
#include <fstream>
#include <string>

namespace {

// Extracts the text that follows `openTag` on `line`, up to (not including)
// the first `closeTag` found after it.
//
// Parameters:
//   line     - the line to search.
//   openTag  - opening marker, e.g. "<docid>".
//   closeTag - closing marker, e.g. "</docid>".
//   content  - receives the extracted text; left untouched when `openTag`
//              does not occur on the line.
//
// Returns true when `openTag` was found. If `closeTag` does not appear after
// the opening tag, everything to the end of the line is taken.
bool extractTagContent(const std::string& line,
                       const std::string& openTag,
                       const std::string& closeTag,
                       std::string& content) {
    const std::string::size_type openPos = line.find(openTag);
    if (openPos == std::string::npos) {
        return false;
    }

    const std::string::size_type begin = openPos + openTag.size();
    // Search for the closing tag only after the opening tag so the length
    // computation can never wrap around.
    const std::string::size_type closePos = line.find(closeTag, begin);
    if (closePos == std::string::npos) {
        content = line.substr(begin);
    } else {
        content = line.substr(begin, closePos - begin);
    }
    return true;
}

// Removes one trailing carriage return so files with CRLF line endings are
// matched the same way as files with LF line endings.
void stripTrailingCarriageReturn(std::string& line) {
    if (!line.empty() && line[line.size() - 1] == '\r') {
        line.erase(line.size() - 1);
    }
}

} // namespace

// Reads the page library file line by line and, for every <doc> ... </doc>
// record, prints the record's docid and description to standard output.
//
// Only text located on the same line as the opening <docid> / <description>
// tag is captured; the input is processed strictly one line at a time.
//
// Returns 1 when the input file cannot be opened, 0 otherwise.
int main() {
    std::ifstream file("../data/dupRemoveWebPageLib.dat"); // replace with the actual file path
    if (!file.is_open()) {
        std::cerr << "无法打开文件" << '\n';
        return 1;
    }

    std::string line;
    std::string docid;
    std::string description;
    bool inDoc = false;

    while (std::getline(file, line)) {
        stripTrailingCarriageReturn(line);

        if (line == "<doc>") {
            // Start every record from a clean state so fields of an
            // unterminated previous record cannot leak into this one.
            inDoc = true;
            docid.clear();
            description.clear();
            continue;
        }

        if (line == "</doc>") {
            // Emit the record only if it was actually opened; a stray
            // closing tag would otherwise print an empty record.
            if (inDoc) {
                std::cout << "Doc ID: " << docid << '\n';
                std::cout << "Description: " << description << '\n';
            }
            inDoc = false;
            continue;
        }

        if (!inDoc) {
            continue;
        }

        // A line contributes to at most one field; <docid> takes precedence
        // when both tags appear on the same line.
        if (!extractTagContent(line, "<docid>", "</docid>", docid)) {
            extractTagContent(line, "<description>", "</description>", description);
        }
    }

    // The ifstream closes the file in its destructor.
    return 0;
}